library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.4
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(htmltools)
library(devtools)
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.0.4
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.0.5
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(NbClust)
# Read in Data
nba <- read_csv("~/Fall21/introDS/DS-3001-New/data/nba2020-21.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Player = col_character(),
## Pos = col_character(),
## Tm = col_character()
## )
## i Use `spec()` for the full column specifications.
nba_salaries <- read_csv("~/Fall21/introDS/DS-3001-New/data/nba_salaries_21.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Player = col_character(),
## `2020-21` = col_double()
## )
nba_final = inner_join(nba, nba_salaries)
## Joining, by = "Player"
#View(nba_final)
#Remove NAs
nba_final <- na.omit(nba_final)
#rename salary varaible
nba_final <- rename(nba_final, Salary = `2020-21`)
#Remove special characters
nba_final$Player <- gsub("[^[:alnum:]]", "", nba_final$Player)
#Cast Pos as factor
nba_final$Pos <- fct_collapse(nba_final$Pos,
C = "C",
PF = "PF",
PG = c("PG", "PG-SG"),
SF = c("SF", "SF-PF"),
SG = "SG")
The variables used to cluster the data are GS (games started), MP (minutes played), FG% (field goal percentage), 3P% (three point percentage), 2P% (two point percentage), eFG% (Effective Field Goal Percentage; the formula is (FG + 0.5 * 3P) / FGA), FT% (Free throw percentage), and PTS (points). All the percentages were chosen since they give the relative success out of the total attempts of the statistic rather than just the quantity of attempts or success. Games started, minutes played, and points give career long statistics that demonstrate long term success. Since these variables are on different scales they were normalized on a scale of 0 to 1.
#Select the variables to be included in the cluster
clust_data_nba <- nba_final[, c("GS", "MP", "FG%", "3P%", "2P%", "eFG%", "FT%", "PTS")]
#Normalization Function
normalize <- function(x){
(x - min(x)) / (max(x) - min(x))
}
#select numeric variable names
abc <- names(select_if(clust_data_nba, is.numeric))
#Apply normalizing
clust_data_nba[abc] <- as_tibble(lapply(clust_data_nba[abc], normalize))
#Run the clustering algo with 2 centers
set.seed(1)
kmeans_obj_nba = kmeans(clust_data_nba, centers = 2,
algorithm = "Lloyd")
#View the results
kmeans_obj_nba
## K-means clustering with 2 clusters of sizes 155, 263
##
## Cluster means:
## GS MP FG% 3P% 2P% eFG% FT%
## 1 0.75815170 0.6969126 0.5316692 0.3471419 0.6898684 0.6273779 0.7845161
## 2 0.09906484 0.2881184 0.4906233 0.3192243 0.6632031 0.5856166 0.7362852
## PTS
## 1 0.4652350
## 2 0.1486547
##
## Clustering vector:
## [1] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 1 1 2 1 2 2 1 2 1 1 1 2 2 2 1 1 2
## [38] 1 1 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 2
## [75] 1 2 1 2 1 2 2 2 2 1 1 2 2 1 2 1 2 1 1 2 1 1 1 2 2 1 2 2 2 2 2 1 2 2 2 2 2
## [112] 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 2 2 1 2 2 2 1 2 1 2 2 1 2 2 1 2
## [149] 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 2 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 2
## [186] 2 2 1 2 2 2 2 1 2 1 1 2 1 2 2 2 1 1 2 1 1 2 1 1 2 2 2 2 2 2 2 1 1 2 1 1 2
## [223] 2 2 2 1 2 1 1 1 1 1 2 2 1 2 1 1 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 2 2 2 2 1 1
## [260] 1 2 1 1 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2
## [297] 1 2 2 2 2 2 1 1 2 2 1 1 1 2 1 2 2 1 1 1 2 1 2 2 1 2 1 1 2 2 2 2 2 2 1 1 1
## [334] 1 1 1 2 2 2 2 2 2 2 1 2 1 1 1 2 1 2 2 2 1 1 1 2 2 2 2 2 2 1 2 1 2 2 2 2 2
## [371] 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 1 2 2 2 1
## [408] 1 1 2 2 1 2 2 2 1 2 1
##
## Within cluster sum of squares by cluster:
## [1] 21.20299 46.29813
## (between_SS / total_SS = 50.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
The largest difference in means were the variables games started (GS), Minutes played (MP), and Points scored (PTS). I chose to visualize the clusters with Points scored on the x axis and minutes played on the y axis in 2 dimensions and then added games started on the z axis for the 3 dimensional graph.
#Visualize the output
salary_clusters_nba = as.factor(kmeans_obj_nba$cluster)
ggplot(nba_final, aes(x = PTS,
y = MP,
color = Salary,
shape = salary_clusters_nba)) +
geom_point(size = 6) +
ggtitle("Minutes Played vs Pts Scored for nba players") +
xlab("Points Scored (PTS)") +
ylab("Minutes Played (MP)") +
scale_shape_manual(name = "Cluster",
labels = c("Cluster 1", "Cluster 2"),
values = c("1", "2")) +
theme_light()
There is a clear correlation with the clusters and salary seen in the graph. Cluster 1 is the under performing cluster, and the salary tends to be a darker color therefore they are paid less. Cluster 2 is the better performing cluster, and they have a lighter color overall therefore they are paid more. We will be looking to find players in cluster 2 that are paid less to bring to the team.
#save as a png
ggsave("NbaSalaryClusters.png",
width = 10,
height = 5.62,
units = "in")
The explained variance is about 50%, which could definitely be improved, but overall it is significant enough to say that there are two distinct clusters.
#Evaluate the quality of the clustering
# Inter-cluster variance,
# "betweenss" is the sum of the distances between points
# from different clusters.
num_nba = kmeans_obj_nba$betweenss
# Total variance, "totss" is the sum of the distances
# between all the points in the data set.
denom_nba = kmeans_obj_nba$totss
# Variance accounted for by clusters.
(var_exp_nba = num_nba / denom_nba)
## [1] 0.5060049
#Use the function we created to evaluate several different number of clusters
# The function explained_variance wraps our code for calculating
# the variance explained by clustering.
explained_variance = function(data_in, k){
# Running the kmeans algorithm.
set.seed(1)
kmeans_obj = kmeans(data_in, centers = k, algorithm = "Lloyd", iter.max = 30)
# Variance accounted for by clusters:
# var_exp = intercluster variance / total variance
var_exp = kmeans_obj$betweenss / kmeans_obj$totss
var_exp
}
explained_var_nba = sapply(1:10, explained_variance, data_in = clust_data_nba)
explained_var_nba
## [1] 1.247996e-15 5.060049e-01 5.948167e-01 6.441040e-01 6.930132e-01
## [6] 7.153475e-01 7.390406e-01 7.460434e-01 7.579561e-01 7.793619e-01
#Create a elbow chart of the output
# Data for ggplot2.
elbow_data_nba = data.frame(k = 1:10, explained_var_nba)
# Plotting data.
ggplot(elbow_data_nba,
aes(x = k,
y = explained_var_nba)) +
geom_point(size = 4) + #<- sets the size of the data points
geom_line(size = 1) + #<- sets the thickness of the line
xlab('k') +
ylab('Inter-cluster Variance / Total Variance') +
theme_light()
#Use NbClust to select a number of clusters
# Run NbClust.
(nbclust_obj_nba = NbClust(data = clust_data_nba, method = "kmeans"))
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 11 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 2 proposed 13 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 7.6120 426.1332 91.1709 36.5429 2484.287 1309559.9 78.2423 67.4996
## 3 1.7281 304.6140 58.8137 33.4956 2844.895 1243489.8 61.3276 55.3656
## 4 0.8079 250.8545 60.4507 29.4360 3160.015 1040196.4 55.5908 48.4931
## 5 1.9489 230.1679 38.6908 28.7133 3451.409 809437.0 33.6598 42.3145
## 6 0.8935 208.6162 38.8708 28.5323 3607.021 803280.4 28.9699 38.6900
## 7 3.2326 196.2497 21.0319 29.3469 3949.433 481948.8 24.4999 35.3544
## 8 1.1279 179.3890 18.5829 29.0077 4064.865 477586.8 21.6039 33.6333
## 9 0.3882 165.9967 27.7850 28.7124 4205.075 432197.4 18.9741 32.1750
## 10 0.9977 160.2708 27.3682 29.3560 4388.476 344069.9 17.2411 30.1283
## 11 7.2157 156.2723 11.3830 30.1154 4540.821 289166.9 13.5681 28.2343
## 12 0.1427 146.7125 25.9116 29.6889 4657.821 260114.7 13.2729 27.4662
## 13 7.4351 144.8712 5.2150 30.5591 4765.235 236095.4 12.1167 25.8184
## 14 0.1646 135.5149 21.8158 29.8075 4850.068 223520.5 12.1366 25.4902
## 15 10.1954 133.8564 5.1149 30.5111 4959.189 197637.1 10.7458 24.1842
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 416.1923 15.2130 0.3193 0.9356 0.4323 1.5255 -93.3563 -1.8088 0.2688
## 3 433.0483 18.5471 0.2860 1.4579 0.2723 1.5286 -80.9202 -1.8152 0.2741
## 4 444.8164 21.1756 0.3137 1.5362 0.2296 2.1261 -88.9810 -2.7689 0.2838
## 5 466.5084 24.2676 0.3080 1.4217 0.2362 1.4919 -43.8538 -1.7239 0.2899
## 6 468.2087 26.5410 0.2889 1.3898 0.2239 1.1471 -14.8763 -0.6694 0.2907
## 7 489.5378 29.0451 0.2759 1.4840 0.2160 0.8771 10.9308 0.7302 0.2708
## 8 497.6069 30.5314 0.2696 1.5375 0.2109 1.2951 -11.8478 -1.1698 0.2603
## 9 508.6330 31.9152 0.2658 1.4611 0.2120 2.6058 -43.1370 -3.1794 0.2525
## 10 526.9319 34.0833 0.2542 1.4656 0.2079 1.2468 -20.1878 -1.0334 0.2486
## 11 540.2173 36.3696 0.2679 1.4621 0.2047 1.5051 -23.1564 -1.7270 0.2400
## 12 551.7180 37.3868 0.2666 1.4754 0.1877 1.7589 -22.0038 -2.2187 0.2309
## 13 557.7145 39.7729 0.3361 1.4433 0.1920 1.1157 -4.7691 -0.5339 0.2273
## 14 569.2706 40.2850 0.3365 1.5515 0.1779 1.5258 -17.5748 -1.7707 0.2193
## 15 577.1240 42.4604 0.3260 1.4295 0.1879 1.2888 -8.9629 -1.1484 0.2127
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 33.7498 0.6378 1.8066 0.4853 0.0775 0.0130 6.0473 0.3721 0.9029
## 3 18.4552 0.5393 0.9801 1.1106 0.0458 0.0149 8.4456 0.3378 0.6370
## 4 12.1233 0.5039 0.5516 1.5202 0.0403 0.0167 8.5139 0.3137 0.4711
## 5 8.4629 0.4836 0.2028 1.9308 0.0805 0.0178 7.4641 0.2939 0.4236
## 6 6.4483 0.4871 0.8099 2.1071 0.0519 0.0190 8.3471 0.2815 0.2744
## 7 5.0506 0.4515 0.7664 2.6397 0.0658 0.0197 8.9887 0.2668 0.2389
## 8 4.2042 0.4327 0.3578 2.9691 0.0658 0.0197 10.0323 0.2604 0.2246
## 9 3.5750 0.4286 0.3255 3.0767 0.0675 0.0201 9.9665 0.2553 0.2137
## 10 3.0128 0.4169 0.5458 3.4006 0.0675 0.0204 9.8946 0.2482 0.2090
## 11 2.5668 0.3954 1.6608 3.9095 0.0736 0.0207 9.6148 0.2418 0.1927
## 12 2.2888 0.3840 0.0806 4.1720 0.0860 0.0212 10.0510 0.2380 0.1796
## 13 1.9860 0.3850 -6.1223 4.2384 0.0956 0.0216 10.1628 0.2324 0.1795
## 14 1.8207 0.3738 0.2149 4.5073 0.0956 0.0217 13.3774 0.2308 0.1730
## 15 1.6123 0.3674 -2.1568 4.7573 0.0761 0.0220 10.6647 0.2252 0.1691
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.7982 68.5152 1.0000
## 3 0.7953 60.2209 1.0000
## 4 0.7631 52.1431 1.0000
## 5 0.7640 41.0886 1.0000
## 6 0.7529 38.0664 1.0000
## 7 0.7417 27.1683 0.6649
## 8 0.6676 25.8879 1.0000
## 9 0.6891 31.5794 1.0000
## 10 0.7549 33.1122 1.0000
## 11 0.6775 32.8385 1.0000
## 12 0.6744 24.6263 1.0000
## 13 0.6806 21.5875 1.0000
## 14 0.6711 24.9985 1.0000
## 15 0.6603 20.5788 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 15.0000 2.0000 3.0000 2.0000 3.0000 7.0 5.0000
## Value_Index 10.1954 426.1332 32.3573 36.5429 360.6084 316969.6 21.9309
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 5.000 13.0000 10.0000 2.0000 2.0000 2.0000
## Value_Index 5.2615 21.692 -1.8739 0.2542 0.9356 0.4323 1.5255
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 6.0000 3.0000 2.0000 2.0000 2.0000
## Value_Index -93.3563 -1.8088 0.2907 15.2946 0.6378 1.8066 0.4853
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 13.0000 0 2.0000 0 15.0000
## Value_Index 0.0956 0 6.0473 0 0.1691
##
## $Best.partition
## [1] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 1 1 2 1 2 2 1 2 1 1 1 2 2 2 1 1 2
## [38] 1 1 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 2
## [75] 1 2 1 2 1 2 2 2 2 1 1 2 2 1 2 1 2 1 1 2 1 1 1 2 2 1 2 2 2 2 2 1 2 2 2 2 2
## [112] 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 2 2 1 2 2 2 1 2 2 2 2 1 2 2 1 2
## [149] 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 2 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 2
## [186] 2 2 1 2 2 2 2 1 2 1 1 2 1 2 2 2 1 1 2 1 1 2 1 1 2 2 2 2 2 2 2 1 1 2 1 1 2
## [223] 2 2 2 1 2 1 1 1 1 1 2 2 1 2 1 1 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 2 2 2 2 1 1
## [260] 1 2 1 1 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2
## [297] 1 2 2 2 2 2 1 1 2 2 1 1 1 2 1 2 2 1 1 1 2 1 2 2 1 2 1 1 2 2 2 2 2 2 1 1 1
## [334] 1 1 1 2 2 2 2 2 2 2 1 2 1 1 1 2 1 2 2 2 1 1 1 2 2 2 2 2 2 1 2 1 2 2 2 2 2
## [371] 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 1 2 2 2 1
## [408] 1 1 2 2 1 2 2 2 1 2 1
# View the output of NbClust.
nbclust_obj_nba
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 7.6120 426.1332 91.1709 36.5429 2484.287 1309559.9 78.2423 67.4996
## 3 1.7281 304.6140 58.8137 33.4956 2844.895 1243489.8 61.3276 55.3656
## 4 0.8079 250.8545 60.4507 29.4360 3160.015 1040196.4 55.5908 48.4931
## 5 1.9489 230.1679 38.6908 28.7133 3451.409 809437.0 33.6598 42.3145
## 6 0.8935 208.6162 38.8708 28.5323 3607.021 803280.4 28.9699 38.6900
## 7 3.2326 196.2497 21.0319 29.3469 3949.433 481948.8 24.4999 35.3544
## 8 1.1279 179.3890 18.5829 29.0077 4064.865 477586.8 21.6039 33.6333
## 9 0.3882 165.9967 27.7850 28.7124 4205.075 432197.4 18.9741 32.1750
## 10 0.9977 160.2708 27.3682 29.3560 4388.476 344069.9 17.2411 30.1283
## 11 7.2157 156.2723 11.3830 30.1154 4540.821 289166.9 13.5681 28.2343
## 12 0.1427 146.7125 25.9116 29.6889 4657.821 260114.7 13.2729 27.4662
## 13 7.4351 144.8712 5.2150 30.5591 4765.235 236095.4 12.1167 25.8184
## 14 0.1646 135.5149 21.8158 29.8075 4850.068 223520.5 12.1366 25.4902
## 15 10.1954 133.8564 5.1149 30.5111 4959.189 197637.1 10.7458 24.1842
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 416.1923 15.2130 0.3193 0.9356 0.4323 1.5255 -93.3563 -1.8088 0.2688
## 3 433.0483 18.5471 0.2860 1.4579 0.2723 1.5286 -80.9202 -1.8152 0.2741
## 4 444.8164 21.1756 0.3137 1.5362 0.2296 2.1261 -88.9810 -2.7689 0.2838
## 5 466.5084 24.2676 0.3080 1.4217 0.2362 1.4919 -43.8538 -1.7239 0.2899
## 6 468.2087 26.5410 0.2889 1.3898 0.2239 1.1471 -14.8763 -0.6694 0.2907
## 7 489.5378 29.0451 0.2759 1.4840 0.2160 0.8771 10.9308 0.7302 0.2708
## 8 497.6069 30.5314 0.2696 1.5375 0.2109 1.2951 -11.8478 -1.1698 0.2603
## 9 508.6330 31.9152 0.2658 1.4611 0.2120 2.6058 -43.1370 -3.1794 0.2525
## 10 526.9319 34.0833 0.2542 1.4656 0.2079 1.2468 -20.1878 -1.0334 0.2486
## 11 540.2173 36.3696 0.2679 1.4621 0.2047 1.5051 -23.1564 -1.7270 0.2400
## 12 551.7180 37.3868 0.2666 1.4754 0.1877 1.7589 -22.0038 -2.2187 0.2309
## 13 557.7145 39.7729 0.3361 1.4433 0.1920 1.1157 -4.7691 -0.5339 0.2273
## 14 569.2706 40.2850 0.3365 1.5515 0.1779 1.5258 -17.5748 -1.7707 0.2193
## 15 577.1240 42.4604 0.3260 1.4295 0.1879 1.2888 -8.9629 -1.1484 0.2127
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 33.7498 0.6378 1.8066 0.4853 0.0775 0.0130 6.0473 0.3721 0.9029
## 3 18.4552 0.5393 0.9801 1.1106 0.0458 0.0149 8.4456 0.3378 0.6370
## 4 12.1233 0.5039 0.5516 1.5202 0.0403 0.0167 8.5139 0.3137 0.4711
## 5 8.4629 0.4836 0.2028 1.9308 0.0805 0.0178 7.4641 0.2939 0.4236
## 6 6.4483 0.4871 0.8099 2.1071 0.0519 0.0190 8.3471 0.2815 0.2744
## 7 5.0506 0.4515 0.7664 2.6397 0.0658 0.0197 8.9887 0.2668 0.2389
## 8 4.2042 0.4327 0.3578 2.9691 0.0658 0.0197 10.0323 0.2604 0.2246
## 9 3.5750 0.4286 0.3255 3.0767 0.0675 0.0201 9.9665 0.2553 0.2137
## 10 3.0128 0.4169 0.5458 3.4006 0.0675 0.0204 9.8946 0.2482 0.2090
## 11 2.5668 0.3954 1.6608 3.9095 0.0736 0.0207 9.6148 0.2418 0.1927
## 12 2.2888 0.3840 0.0806 4.1720 0.0860 0.0212 10.0510 0.2380 0.1796
## 13 1.9860 0.3850 -6.1223 4.2384 0.0956 0.0216 10.1628 0.2324 0.1795
## 14 1.8207 0.3738 0.2149 4.5073 0.0956 0.0217 13.3774 0.2308 0.1730
## 15 1.6123 0.3674 -2.1568 4.7573 0.0761 0.0220 10.6647 0.2252 0.1691
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.7982 68.5152 1.0000
## 3 0.7953 60.2209 1.0000
## 4 0.7631 52.1431 1.0000
## 5 0.7640 41.0886 1.0000
## 6 0.7529 38.0664 1.0000
## 7 0.7417 27.1683 0.6649
## 8 0.6676 25.8879 1.0000
## 9 0.6891 31.5794 1.0000
## 10 0.7549 33.1122 1.0000
## 11 0.6775 32.8385 1.0000
## 12 0.6744 24.6263 1.0000
## 13 0.6806 21.5875 1.0000
## 14 0.6711 24.9985 1.0000
## 15 0.6603 20.5788 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 15.0000 2.0000 3.0000 2.0000 3.0000 7.0 5.0000
## Value_Index 10.1954 426.1332 32.3573 36.5429 360.6084 316969.6 21.9309
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 5.000 13.0000 10.0000 2.0000 2.0000 2.0000
## Value_Index 5.2615 21.692 -1.8739 0.2542 0.9356 0.4323 1.5255
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 6.0000 3.0000 2.0000 2.0000 2.0000
## Value_Index -93.3563 -1.8088 0.2907 15.2946 0.6378 1.8066 0.4853
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 13.0000 0 2.0000 0 15.0000
## Value_Index 0.0956 0 6.0473 0 0.1691
##
## $Best.partition
## [1] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 1 1 2 1 2 2 1 2 1 1 1 2 2 2 1 1 2
## [38] 1 1 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 2
## [75] 1 2 1 2 1 2 2 2 2 1 1 2 2 1 2 1 2 1 1 2 1 1 1 2 2 1 2 2 2 2 2 1 2 2 2 2 2
## [112] 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 2 2 1 2 2 2 1 2 2 2 2 1 2 2 1 2
## [149] 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 2 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 2
## [186] 2 2 1 2 2 2 2 1 2 1 1 2 1 2 2 2 1 1 2 1 1 2 1 1 2 2 2 2 2 2 2 1 1 2 1 1 2
## [223] 2 2 2 1 2 1 1 1 1 1 2 2 1 2 1 1 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 2 2 2 2 1 1
## [260] 1 2 1 1 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2
## [297] 1 2 2 2 2 2 1 1 2 2 1 1 1 2 1 2 2 1 1 1 2 1 2 2 1 2 1 1 2 2 2 2 2 2 1 1 1
## [334] 1 1 1 2 2 2 2 2 2 2 1 2 1 1 1 2 1 2 2 2 1 1 1 2 2 2 2 2 2 1 2 1 2 2 2 2 2
## [371] 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 1 2 2 2 1
## [408] 1 1 2 2 1 2 2 2 1 2 1
# View the output that shows the number of clusters each method recommends.
nbclust_obj_nba$Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 15.0000 2.0000 3.0000 2.0000 3.0000 7.0 5.0000
## Value_Index 10.1954 426.1332 32.3573 36.5429 360.6084 316969.6 21.9309
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 5.000 13.0000 10.0000 2.0000 2.0000 2.0000
## Value_Index 5.2615 21.692 -1.8739 0.2542 0.9356 0.4323 1.5255
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 6.0000 3.0000 2.0000 2.0000 2.0000
## Value_Index -93.3563 -1.8088 0.2907 15.2946 0.6378 1.8066 0.4853
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 13.0000 0 2.0000 0 15.0000
## Value_Index 0.0956 0 6.0473 0 0.1691
#Display the results visually
freq_k_nba = nbclust_obj_nba$Best.nc[1,]
freq_k_nba = data.frame(freq_k_nba)
#View(freq_k_nba)
# Check the maximum number of clusters suggested.
max(freq_k_nba)
## [1] 15
#essentially resets the plot viewer back to default
#dev.off()
# Plot as a histogram.
ggplot(freq_k_nba,
aes(x = freq_k_nba)) +
geom_bar() +
scale_x_continuous(breaks = seq(0, 15, by = 1)) +
scale_y_continuous(breaks = seq(0, 12, by = 1)) +
labs(x = "Number of Clusters",
y = "Number of Votes",
title = "Cluster Analysis")
#Using the recommended number of cluster compare the quality of the model
#with 2 clusters
# Both the elbow graph and the nbc Cluster method recommend two clusters.
#Bonus: Create a 3d version of the output
#Add clusters to nba_final dataframe
nba_final$clusters <- (salary_clusters_nba)
# Use plotly to do a 3d imaging
fig <- plot_ly(nba_final,
type = "scatter3d",
mode="markers",
symbol = ~clusters, symbols = c('circle', 'square'),
x = ~PTS,
y = ~MP,
z = ~GS,
color = ~Salary,
text = ~paste('Player:',Player,
"Position:",Pos,
"Team:", Tm))
fig
Looking at the 3D plot of the two clusters, under performing (circles) and over performing (squares), we can see there is a trend of under performing players getting paid less (dark blue). When trying to find the best performing payers that don’t get paid enough, we are looking for players in the over performing square cluster, who have a lower salary (darker blue) rather than the greens and yellows of the higher paid players in the cluster. The three players I would recommend therefore are:
Each of these players are in the better performing cluster and are towards the high end of minutes played, games started, and points scored, however they all have salaries under 10 million. Players with similar stats as them in the same cluster are normally paid over 20 million. Therefore these are the players that are high performing but not highly paid that you can steal to get the team to the playoffs.